import glob
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.express as px
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import LSTM, Dropout, Dense
# Concatenating all files using glob library
joined_files = os.path.join("Sales*.csv")
joined_list = glob.glob(joined_files)
df = pd.concat(map(pd.read_csv, joined_list), ignore_index=True)
# Def function for dataframe details:
def check_df(dataframe, head=5):
print(" SHAPE ".center(70,'-'))
print('Rows: {}'.format(dataframe.shape[0]))
print('Columns: {}'.format(dataframe.shape[1]))
print(" TYPES ".center(70,'-'))
print(dataframe.dtypes)
print(" HEAD ".center(70,'-'))
print(dataframe.head(head))
print(" TAIL ".center(70,'-'))
print(dataframe.tail(head))
print(" MISSING VALUES ".center(70,'-'))
print(dataframe.isnull().sum())
print(" DUPLICATED VALUES ".center(70,'-'))
print(dataframe.duplicated().sum())
print(" DESCRIBE ".center(70,'-'))
print(dataframe.describe())
check_df(df)
------------------------------- SHAPE --------------------------------
Rows: 186850
Columns: 6
------------------------------- TYPES --------------------------------
Order ID object
Product object
Quantity Ordered object
Price Each object
Order Date object
Purchase Address object
dtype: object
-------------------------------- HEAD --------------------------------
Order ID Product Quantity Ordered Price Each \
0 176558 USB-C Charging Cable 2 11.95
1 NaN NaN NaN NaN
2 176559 Bose SoundSport Headphones 1 99.99
3 176560 Google Phone 1 600
4 176560 Wired Headphones 1 11.99
Order Date Purchase Address
0 04/19/19 08:46 917 1st St, Dallas, TX 75001
1 NaN NaN
2 04/07/19 22:30 682 Chestnut St, Boston, MA 02215
3 04/12/19 14:38 669 Spruce St, Los Angeles, CA 90001
4 04/12/19 14:38 669 Spruce St, Los Angeles, CA 90001
-------------------------------- TAIL --------------------------------
Order ID Product Quantity Ordered Price Each \
186845 259353 AAA Batteries (4-pack) 3 2.99
186846 259354 iPhone 1 700
186847 259355 iPhone 1 700
186848 259356 34in Ultrawide Monitor 1 379.99
186849 259357 USB-C Charging Cable 1 11.95
Order Date Purchase Address
186845 09/17/19 20:56 840 Highland St, Los Angeles, CA 90001
186846 09/01/19 16:00 216 Dogwood St, San Francisco, CA 94016
186847 09/23/19 07:39 220 12th St, San Francisco, CA 94016
186848 09/19/19 17:30 511 Forest St, San Francisco, CA 94016
186849 09/30/19 00:18 250 Meadow St, San Francisco, CA 94016
--------------------------- MISSING VALUES ---------------------------
Order ID 545
Product 545
Quantity Ordered 545
Price Each 545
Order Date 545
Purchase Address 545
dtype: int64
------------------------- DUPLICATED VALUES --------------------------
1162
------------------------------ DESCRIBE ------------------------------
Order ID Product Quantity Ordered Price Each \
count 186305 186305 186305 186305
unique 178438 20 10 24
top Order ID USB-C Charging Cable 1 11.95
freq 355 21903 168552 21903
Order Date Purchase Address
count 186305 186305
unique 142396 140788
top Order Date Purchase Address
freq 355 355
# Dropping Null Values
df = df.dropna(how='all')
# Value Counts
df['Quantity Ordered'].value_counts()
Quantity Ordered 1 168552 2 13324 3 2920 4 806 Quantity Ordered 355 5 236 6 80 7 24 8 5 9 3 Name: count, dtype: int64
# Value Counts
df['Price Each'].value_counts()
Price Each 11.95 21903 14.95 21658 2.99 20641 3.84 20577 11.99 18882 150 15450 99.99 13325 149.99 7507 700 6804 389.99 6230 379.99 6181 600 5490 300 4780 1700 4702 999.99 4128 109.99 4101 400 2056 600.0 1347 Price Each 355 150.0 99 700.0 38 1700.0 22 300.0 20 400.0 9 Name: count, dtype: int64
# Dropping rows with unnecessary values from the features
df.drop(df[df['Quantity Ordered'] == 'Quantity Ordered'].index, inplace = True)
df.drop(df[df['Price Each'] == 'Price Each'].index, inplace = True)
# Converting the datatypes to int and float of respective features
df['Quantity Ordered'] = df['Quantity Ordered'].astype(int)
df['Price Each'] = df['Price Each'].astype(float)
# Converting Feature into Datetime feature
df['Order Date'] = pd.to_datetime(df['Order Date'])
C:\Users\KaMaL RaJiT\AppData\Local\Temp\ipykernel_10908\3171770558.py:3: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.
# Extracting City from the Address for ease of Analysis
df['City'] = df['Purchase Address'].str.split(',', expand = True)[1]
# Creating new feature 'Revenue', which is Selling Price * Quantity
df['Revenue'] = df['Quantity Ordered'] * df['Price Each']
# Extracting date, month, quarter from datetime feature
df['Date'] = df['Order Date'].dt.date
df['Month'] = df['Order Date'].dt.month
df['Quarter'] = df['Order Date'].dt.quarter
# Mapping Month names for the feature
df['Month'] = df['Month'].map({1:'Jan', 2:'Feb', 3:'Mar', 4:'Apr', 5:'May', 6:'June',
7:'July', 8:'Aug', 9:'Sep', 10:'Oct', 11:'Nov', 12:'Dec'})
df.head()
| Order ID | Product | Quantity Ordered | Price Each | Order Date | Purchase Address | City | Revenue | Date | Month | Quarter | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 176558 | USB-C Charging Cable | 2 | 11.95 | 2019-04-19 08:46:00 | 917 1st St, Dallas, TX 75001 | Dallas | 23.90 | 2019-04-19 | Apr | 2 |
| 2 | 176559 | Bose SoundSport Headphones | 1 | 99.99 | 2019-04-07 22:30:00 | 682 Chestnut St, Boston, MA 02215 | Boston | 99.99 | 2019-04-07 | Apr | 2 |
| 3 | 176560 | Google Phone | 1 | 600.00 | 2019-04-12 14:38:00 | 669 Spruce St, Los Angeles, CA 90001 | Los Angeles | 600.00 | 2019-04-12 | Apr | 2 |
| 4 | 176560 | Wired Headphones | 1 | 11.99 | 2019-04-12 14:38:00 | 669 Spruce St, Los Angeles, CA 90001 | Los Angeles | 11.99 | 2019-04-12 | Apr | 2 |
| 5 | 176561 | Wired Headphones | 1 | 11.99 | 2019-04-30 09:27:00 | 333 8th St, Los Angeles, CA 90001 | Los Angeles | 11.99 | 2019-04-30 | Apr | 2 |
df.dtypes
Order ID object Product object Quantity Ordered int32 Price Each float64 Order Date datetime64[ns] Purchase Address object City object Revenue float64 Date object Month object Quarter int32 dtype: object
quantity_ordered = df.groupby('Product')['Quantity Ordered'].sum().sort_values(ascending=False).reset_index()
fig1 = px.bar(quantity_ordered, x='Product', y='Quantity Ordered', text = 'Quantity Ordered',
hover_data=['Product', 'Quantity Ordered'], color='Quantity Ordered')
fig1.update_traces(texttemplate='%{text:.2s}', textposition='outside')
fig1.update_layout(uniformtext_minsize=8, uniformtext_mode='hide')
fig1.update_layout(title_text='Most Ordered Products')
fig1.show()
order_per_month = df.groupby('Month')['Order ID'].nunique().reset_index()
colors = ['lightslategray',] * 12
colors[2] = 'crimson'
fig2 = go.Figure(data=[go.Bar(
x= order_per_month['Month'],
y= order_per_month['Order ID'],
text = order_per_month['Order ID'],
marker_color=colors
)])
fig2.update_traces(texttemplate="%{text:.2s}", textposition='outside')
fig2.update_layout(uniformtext_minsize=2, uniformtext_mode='hide')
fig2.update_layout(title_text='Highest Orders in a Month')
order_per_city = df.groupby('City')['Order ID'].nunique().reset_index()
colors = ['lightslategray',] * 12
colors[7] = 'crimson'
fig3 = go.Figure(data=[go.Bar(
x= order_per_city['City'],
y= order_per_city['Order ID'],
text = order_per_city['Order ID'],
marker_color=colors
)])
fig3.update_traces(texttemplate="%{text:.2s}", textposition='outside')
fig3.update_layout(uniformtext_minsize=2, uniformtext_mode='hide')
fig3.update_layout(title_text='Highest Orders Per City')
high_revenue_products = df.groupby('Product')['Revenue'].sum().sort_values(ascending=False).head(10).reset_index()
fig4 = px.line(df, x='Product', y='Revenue', color='Product', markers=True, hover_data='Revenue')
fig4.update_layout(title = "Top Revenue Generated Products")
fig4.show()
least_revenue_products = df.groupby('Product')['Revenue'].sum().sort_values(ascending =True).tail(5).reset_index()
fig5 = px.bar(least_revenue_products, x = 'Product', y = 'Revenue')
fig5.update_layout(title="Least 5 Revenue Generated Products")
fig5.show()
monthly_revenue = df.groupby('Month')['Revenue'].sum().sort_values(ascending=False).reset_index()
labels = monthly_revenue['Month']
values = monthly_revenue['Revenue']
fig6 = go.Figure(data=[go.Pie(labels=labels, values=values, text = monthly_revenue['Revenue'], pull=[0.2, 0, 0, 0])])
fig6.update_traces(texttemplate="%{text:.2s}", textfont_size=15, textposition='outside',
marker=dict(line=dict(color='#000000', width=2)))
fig6.update_layout(title_text='Revenue Generated Monthly')
revenue_city = df.groupby('City')['Revenue'].sum().sort_values(ascending=False).reset_index()
fig7 = px.scatter(revenue_city, x="Revenue", y="City",
size="Revenue", color="City",
hover_name="City", log_x=False, size_max=70)
fig7.update_layout(title = 'Scatter of Revenue by City')
fig7.show()
product_price = df.groupby('Product')['Price Each'].max().sort_values(ascending=False).head().reset_index()
product_price
| Product | Price Each | |
|---|---|---|
| 0 | Macbook Pro Laptop | 1700.00 |
| 1 | ThinkPad Laptop | 999.99 |
| 2 | iPhone | 700.00 |
| 3 | LG Dryer | 600.00 |
| 4 | LG Washing Machine | 600.00 |
fig8 = px.histogram(df, x = 'Revenue', y = 'Quantity Ordered', color = 'City', marginal="box",
hover_data = df.columns)
fig8.show()